Environment
library(tidyr)
library(plyr)
library(dplyr)
library(magrittr)
library(ggplot2)
library(stringr)
options(scipen = 999)
Load Data
df <- read.csv("https://raw.githubusercontent.com/x-atlas-consortia/hra-pop/main/output-data/v0.10.2/reports/atlas/validation-v5.csv", header=T)
Prepare data
# Remove "sc_proteomics"
df <- df[df$tool != "sc_proteomics",]
# Prepare dataset
#df$dataset <- factor(df$dataset)
# Prepare organ labels
df$organ <- df$organ %>%
str_replace("right ", "") %>%
str_replace("left ", "") %>%
str_replace("Set of ", "") %>%
str_replace(" of body", "") %>%
str_to_title() %>%
str_trim()
df$organ <- factor(df$organ)
# Prepare Cell Labels
df$cell_label <- df$cell_label %>%
str_to_upper()
# Prepare tool
df$tool <- df$tool %>%
str_to_title()
df$tool <- factor(df$tool)
#ct <- unique(df$cell_label)
#write.csv(ct, file=paste0(getwd(),"/ct_pop_cellTypeList.csv"), row.names=F)
Creating labels for bar plots
# Extract anatomical structures from dataset URI
df$structure <- NA
df$structure[!grepl("https://doi.org/",df$dataset) &
grepl('#',df$dataset)] <-
df$dataset[!grepl("https://doi.org/",df$dataset) &
grepl('#',df$dataset)] %>%
str_split_i(pattern="\\$", i=-1) %>%
str_replace_all(pattern="\\%20"," ") %>%
str_replace(pattern="heart ","") %>%
str_to_title()
# Labels for datasets
df$label <- NA
## Labels datasets without structures
df[is.na(df$structure),]$label <-
paste0(df[is.na(df$structure),]$organ,"-",
df[is.na(df$structure),]$dataset)
## Labels for datasets with structures
df[!is.na(df$structure),]$label <-
paste0(df[!is.na(df$structure),]$organ,"-",
df[!is.na(df$structure),]$structure,"-",
df[!is.na(df$structure),]$dataset)
# Factor label and dataset vars
df$label <- as.factor(df$label)
df$dataset <- as.factor(df$dataset)
Pivot table
# Organ Cell Type Percentages
piv <- df %>%
select(organ, cell_label, percentage) %>%
ddply(.(organ,cell_label), summarise,
mean_per = mean(percentage),
sd_per = sd(percentage))
#write.csv(piv, file=paste0(getwd(),"/ct_pop_organ_celltype_meanPercent.csv"),
# row.names = F)
# Cell counts for each organ
piv2 <- piv %>%
count(organ) %>%
arrange(desc(n))
names(piv2)[2] <- "cells"
Stacked Bar Graphs
Heart Datasets
hrt %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Heart Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Respiratory System Datasets
lng %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Respiratory System Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Liver Datasets
liv %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Liver Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Lactiferous Glands In Breast Datasets
lgb %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Lactiferous Glands In Breast Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Kidney Datasets
kid %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Kidney Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Large Intestine Datasets
li %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Large Intestine Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Small Intestine Datasets
si %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Small Intestine Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Skin Datasets
skin %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Skin Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Spleen Datasets
spl %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Spleen Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Ureter Datasets
ute %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Ureter Datasets - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")

Male Reproductive System Datasets
mrs %>%
ggplot(aes(y=label, x=percentage)) +
geom_col(aes(fill=cell_label)) +
facet_grid(rows = vars(tool)) +
scale_y_discrete(expand=c(0,0)) +
scale_x_continuous(expand=c(0,0)) +
guides(fill="none") +
labs(title="Male Reproductive System - Top 10 Cell Types, by Percentage & Tool",
x="Percent",y="Dataset")
